As the dataset consisted of some countries which are now defucnt or merged, following operations were performed: 1. GDR,FRG,EUA were classified as GER 2. URS was classified as RUS 3. TCH was classified as CZE 4. The only countries which are not present in the updated list are of Unified team and Yugoslavia
## 'data.frame': 201 obs. of 4 variables:
## $ Country : chr "Afghanistan" "Albania" "Algeria" "American Samoa*" ...
## $ Code : chr "AFG" "ALB" "ALG" "ASA" ...
## $ Population : int 32526562 2889167 39666519 55538 70473 25021974 91818 43416755 3017712 103889 ...
## $ GDP.per.Capita: num 594 3945 4206 NA NA ...
## 'data.frame': 5770 obs. of 9 variables:
## $ Year : int 1924 1924 1924 1924 1924 1924 1924 1924 1924 1924 ...
## $ City : Factor w/ 19 levels "Albertville",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Sport : Factor w/ 7 levels "Biathlon","Bobsleigh",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Discipline: Factor w/ 15 levels "Alpine Skiing",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Athlete : Factor w/ 3761 levels "ÖBERG, Carl-Göran",..: 301 2095 2096 3472 164 1511 1512 3483 398 831 ...
## $ Country : Factor w/ 45 levels "AUS","AUT","BEL",..: 16 16 16 16 37 37 37 37 15 15 ...
## $ Gender : Factor w/ 2 levels "Men","Women": 1 1 1 1 1 1 1 1 1 1 ...
## $ Event : Factor w/ 83 levels "10000M","1000M",..: 58 58 58 58 58 58 58 58 58 58 ...
## $ Medal : Factor w/ 3 levels "Bronze","Gold",..: 1 1 1 1 2 2 2 2 3 3 ...
An issue with just applying grouping operations to dataset is that it results in values which are inflated due to consideration of each medal obtained in team events as individual medal. This results in the results getting skewed.
To mitigate this effect, I have considered to count only those which are distinct based on attributes like Year,Sport,Discipline,Event,Medal. This results in obtaining accurate values of medal counts for most of countries with minor differences for some countries.
In the all-time medal count, Norway occupies the top slot with 329 medals. In my analysis, since I have combined the different symbols for Germany and Russia, they appear on top followed by Norway with its 329 medals.
#But here the individual medals are repeatedly added in team counts for group events, so need to find a way to prevent this.
# cgp_medaltype = group_by(cgp,Country,Medal)
# cgp_mts = summarize(cgp_medaltype,count=n())
# reorderby = aggregate(count ~ Country,cgp_mts,sum)
# cgp_mts = merge(cgp_mts,reorderby,by="Country")
# colnames(cgp_mts) = c("Country","Medal","Medal_Type_Count","Total_Count")
#
# ggplot(cgp_mts,aes(x=Medal_Type_Count,y=reorder(Country,Total_Count),color=Medal)) + geom_point() + facet_grid(. ~ Medal)
#so some modifications are made while grouping to remove this anomaly
cgp_temp = cgp
cgp_temp <- within(cgp_temp, Gender[Event == 'Pairs'] <- 'Men')
cgp_temp = group_by(cgp_temp,Country,Medal)
cgp_temp = summarize(cgp_temp,count=n_distinct(Year,City,Sport,Discipline,Event,Gender))
reorderby = aggregate(count ~ Country,cgp_temp,sum)
cgp_temp = merge(cgp_temp,reorderby,by="Country")
colnames(cgp_temp) = c("Country","Medal","Medal_Type_Count","Total_Count")
cgp_mt_team = group_by(cgp,Country,Medal)
cgp_team_mts = summarize(cgp_mt_team,count=n_distinct(Year,City,Sport,Discipline,Event,Gender))
reorderby = aggregate(count ~ Country,cgp_team_mts,sum)
cgp_team_mts = merge(cgp_team_mts,reorderby,by="Country")
colnames(cgp_team_mts) = c("Country","Medal","Medal_Type_Count","Total_Count")
p = ggplot(cgp_temp,aes(x=Medal_Type_Count,y=reorder(Country,Total_Count),color=Medal)) + geom_point() + facet_grid(. ~ Medal)
interactive1 = p + ggtitle("Medal Count of each type for Countries across Years")+xlab("Medal Count")+ylab(NULL)+scale_color_manual(values = c("Bronze"=rgb(205,127,50,maxColorValue = 255),"Gold"=rgb(255,219,88,maxColorValue = 255),"Silver"="Dark Grey")) + scale_x_continuous(breaks = seq(0,175,25))
combined_summary = merge(cgp_plot1,cgp_team_mts,by="Country")
colnames(combined_summary) = c("Country","Olympics medaled in","Medal","Medal_Type_Count","Total_Count")
combined_summary_temp = merge(cgp_plot1,cgp_temp,by="Country")
colnames(combined_summary_temp) = c("Country","Olympics medaled in","Medal","Medal_Type_Count","Total_Count")
interactive1
#ggplotly(p)
summary_sorted = combined_summary[order(-combined_summary[,5]),]
rownames(summary_sorted) <- 1:nrow(summary_sorted)
top10 = summary_sorted[1:30,]
combined_subset = filter(combined,Country %in% top10$Country)
subset_gp = group_by(combined_subset,Country,Year,Gender)
subset_gps_1 = summarise(subset_gp,count=n_distinct(City,Sport,Discipline,Event,Medal))
#Men vs women comparison for United states
p1 = ggplot(filter(subset_gps_1,Country=="United States"),aes(x=Year,y=count,color=Gender))+geom_line(lwd=0.8)+xlab("Years")+ylab("Medal Count")+ggtitle("Medal Count for United States across years for Men and Women")+scale_color_manual(values =c("Blue","Pink")) + theme_minimal() + scale_x_continuous(breaks = seq(1924,2014,8))
p1
p = filter(subset_gps_1,Country=="United States") %>%
group_by(.,Gender) %>%
summarise(.,count_total = sum(count))%>%
ggplot(.,aes(x=Gender,y=count_total))+geom_bar(stat="identity",width = 0.2) + ylab("Medals won overtime by athletes of USA") + xlab("Gender") + ggtitle("Medals won by Male and Female Athletes across Years for USA") + theme_minimal()
p
#Top 10 countries total medals
#p = ggplot(top10,aes(x=Total_Count,y=reorder(Country,Total_Count))) + geom_point()+xlab("Total Count of Medals Won")+ylab("Country")+ggtitle("Top Countries with respect to Medals won")
#p
top6 = summary_sorted[1:18,]
combined_temp <- within(combined, Gender[Event == 'Pairs'] <- 'Men')
top3_c = filter(combined_temp,Country %in% top6$Country)
#Year wise medal tally for top 10 countries
medals_indi = group_by(top3_c,Country,Year)
medals_acrossyr = summarise(medals_indi,count=n_distinct(Sport,Discipline,Event,Gender,Medal))
#aggregate(count~Country,medals_acrossyr,sum)
interactive2= ggplot(medals_acrossyr,aes(x=Year,y=count,color=Country))+geom_line(lwd=0.8) + xlab("Year") + ylab("Medals won per Year") + ggtitle("Medals won across Years")+ scale_x_continuous(breaks = seq(1924,2014,8))
interactive2
combined_summary_subset = filter(combined_summary_temp[combined_summary_temp$Medal=="Gold",])
rownames(combined_summary_subset) <- 1:nrow(combined_summary_subset)
small_df = subset(combined_temp,select=c("Country","Population","GDP.per.Capita"))
part2 = merge(combined_summary_subset,small_df,by="Country")
part2 = part2[!duplicated(part2),]
rownames(part2) = 1:nrow(part2)
#Unnormalized Plot based on Gold Medals
ggplot(part2,aes(x=Medal_Type_Count,y=reorder(Country,Medal_Type_Count)))+geom_point(color="Blue",size=2.5,alpha=0.6) + xlab("Gold Medal Count") + ylab("Country") + ggtitle("Unadjusted ranking based on gold medal count") + theme_economist()+ scale_x_continuous(breaks = seq(0,175,25))
#Adjusted based on gold medals + population
sdf = part2[order(-part2[,4],-part2[,6]),]
ggplot(sdf,aes(x=Medal_Type_Count,y=Country))+geom_point(color="Blue",size=2.5,alpha=0.6) + xlab("Gold Medal Count") + ylab("Country") + ggtitle("Adjusted ranking based for gold medal count based on population") + theme_economist()+ scale_x_continuous(breaks = seq(0,175,25))
#Adjusted based on GDP
ggplot(sdf,aes(x=Medal_Type_Count,y=reorder(Country,GDP.per.Capita,na.rm=TRUE)))+geom_point(color="Blue",size=2.5,alpha=0.6) + xlab("Gold Medal Count") + ylab("Country") + ggtitle("Adjusted ranking for gold medal count based on GDPperCapita")+ theme_economist()+ scale_x_continuous(breaks = seq(0,175,25))
Generally the countries that have a higher GDP per capita tend to do well overallat winter olympics and more specifically with respect to their Gold Medal Count, in this case, for examples countries like Switzerland and Sweden
library(rvest)
library(stringr)
wiki_hosts <- read_html("https://en.wikipedia.org/wiki/Winter_Olympic_Games")
hosts <- html_table(html_nodes(wiki_hosts, "table")[[5]], fill=TRUE)
hosts <- hosts[-1,1:3]
hosts$city <- str_split_fixed(hosts$Host, n=2, ",")[,1]
hosts$country <- str_split_fixed(hosts$Host, n=2, ",")[,2]
cc = data_frame(hosts$Year,hosts$city,hosts$country)
colnames(cc) = c("Year","City","Host Country")
cc$City[cc$City=="St. Moritz"] = "St.Moritz"
cc$City[cc$City=="Garmisch-Partenkirchen"] = "Garmisch Partenkirchen"
cc = cc[1:24,]
cc = cc[-c(5,6),]
combined_cc = merge(combined_temp,cc,by=c("Year","City"))
combined_cc$`Host Country` = str_trim(combined_cc$`Host Country`,"left")
host_adv = subset(combined_cc,Country==`Host Country`)
host_adv_gp = group_by(host_adv,Country,Year)
host_adv_gps = summarize(host_adv_gp,count=n_distinct(Year,City,Sport,Discipline,Event,Gender,Medal))
reorderby_host = aggregate(count ~ Country,host_adv_gps,sum)
host_adv_gps = merge(host_adv_gps,reorderby_host,by="Country")
colnames(host_adv_gps) = c("HC Country","Year","Medal_Type_Count_HC","Medals when hosted")
host_total_medals = subset(host_adv_gps,select=c("HC Country","Medals when hosted"))
host_total_medals = host_total_medals[!duplicated(host_total_medals),]
#Number of times olympics hosted
hc_year = subset(host_adv_gps,select=c("HC Country","Year"))
hc_year = group_by(hc_year,hc_year$`HC Country`)
hc_year = summarise(hc_year,count=n())
host_total_medals$`Medals when hosted` = host_total_medals$`Medals when hosted`/hc_year$count
host_total_medals$`Medals when hosted` = as.integer(host_total_medals$`Medals when hosted`)
total_medals = subset(combined_summary_temp,select=c("Country","Olympics medaled in","Total_Count"))
total_medals = total_medals[!duplicated(total_medals),]
colnames(host_total_medals) = c("Country","Medals when hosted")
host_combined = merge(host_total_medals,total_medals,by="Country")
colnames(hc_year) = c("Country","count")
host_combined = merge(host_combined,hc_year,by="Country")
host_combined["Remaining_participation"] = host_combined$`Olympics medaled in`- host_combined$count
host_combined["Average_Rest"] = host_combined$Total_Count/host_combined$Remaining_participation
host_combined$Average_Rest = as.integer(host_combined$Average_Rest)
interactive3 = ggplot(host_combined,aes(x=host_combined$`Medals when hosted`,y=host_combined$`Average_Rest`,text=paste("Country: ",host_combined$Country)))+geom_point(alpha=0.5,size=2.5)+xlab("Medals won when hosted the games")+ geom_abline(intercept = 0,slope = 1) + ylab("Mean value of medals won for rest of the olympics")+ggtitle("Comparative Study of Host Country Advantage") + xlim(0,90) + ylim(0,90) + theme_minimal()
interactive3
This visualization allows us to contrast the total medals achieved when a country was hosting the olympics to the average of all medals obtained when the country was not hosting. Although the analysis needs more scrutiny, the plot suggests an evidence or pattern that suggests host country advantage, as many of the observations fall on the right side of the 45 degree line in the scatterplot, thereby indicating influence of host country advantage on the medal tally.
For this analysis, I have selected the country of Norway. The next two visualizations depict the events and sports respectively that have helped Norway obtain most number of medals.
snorway = combined_temp %>% filter(Country=='Norway')
sngp = group_by(snorway,Sport,Discipline,Event)
sngps = summarise(sngp,count=n_distinct(Year,Gender,Medal))
sngps = sngps[order(-sngps$count),]
sngps['cumsum_count'] = cumsum(sngps$count)
top10_ne = sngps[1:10,]
ggplot(top10_ne,aes(x=reorder(Event,count),y=count)) + geom_bar(stat="identity") + coord_flip() + xlab("Event which have brought most medals") + ylab("Count") + ggtitle("Events which have brought most medals to Norway") + theme_minimal()
These top 10 events have brought half of Norway’s total medals.
sngp_sport = group_by(snorway,Sport)
sngpss = summarise(sngp_sport,count=n_distinct(Discipline,Event,Year,Gender,Medal))
sngps_sports = sngpss[order(-sngpss$count),]
sngps_sports['cumsum_count'] = cumsum(sngps_sports$count)
sngps_sports['Percent_medals_brought'] = sngps_sports['count']*100/329
ggplot(sngps_sports,aes(x=reorder(Sport,Percent_medals_brought),y=Percent_medals_brought)) + geom_bar(stat="identity",width = 0.5) + coord_flip() + xlab("Sports") + ylab('Percent of total medals brought in by specific sport (%)') + ggtitle("Percent of total medals brought by Sports") + theme_minimal()
This is an interesting result which describes that Skiing has brought more than 60% of the total 329 medals for Norway.
In this part, for analysing the performance of athletes, I have used the following metrics:
1. Most Successful athletes in terms of Total Medals obtained. 2. Athletes who have participated in most olympics across years. 3. Most Successful athletes in terms of Total Medals obtained categorized by Sports.
ath = group_by(combined_temp,Athlete)
aths = summarise(ath,count=n_distinct(Year,City,Sport,Discipline,Event,Medal))
aths = aths[order(-aths$count),]
aths = aths[1:10,]
ggplot(aths,aes(x=reorder(Athlete,count),y=count)) + geom_bar(stat="identity") + coord_flip() + xlab("Top Athletes") +ylab("Medals won by them") + ggtitle("Most Successful Athletes") + theme_minimal()
ath_y = group_by(combined_temp,Athlete)
ath_ys = summarise(ath_y,count=n_distinct(Year))
ath_ys = ath_ys[order(-ath_ys$count),]
ggplot(ath_ys[1:20,],aes(x=reorder(Athlete,count),y=count))+geom_bar(stat="identity")+coord_flip()+xlab("Top Athletes wrt participation") + ylab("Number of winter olympics participated") + ggtitle("Athletes who have participated the most number of times in olympics") + theme_minimal()
ath_spg = group_by(combined_temp,Sport,Athlete)
ath_spg = summarize(ath_spg,count=n_distinct(Year,Discipline,Event,Medal))
ath_spg = ath_spg[order(-ath_spg$count,ath_spg$Sport),]
ggplot(ath_spg[1:15,],aes(x=reorder(interaction(Sport,Athlete),count),y=count)) + geom_point(alpha=0.5,size=2.5,color=c("blue")) + coord_flip() + xlab("Top Medal wining Athletes by Sport") + ylab("Number of medals won") +ggtitle("Top Medal wins by athletes categorized by Sports") + theme_minimal()
I have added interactivity for this plot as adding interactivity allows a user to view the statistics of a country (individual medal tally and total medal tally) that he is interested in carefully, thus allowing him to gain valuable information through the visualization.
ggplotly(interactive1)
I have added interactivity for this visualization as this allows a person to trace the respective medal counts for top countries across years. This allows a comparative study not only across years but also across countries for a specific year. This is an added advantage of this visualization.
ggplotly(interactive2)
Adding interactivity to this plot allows us to view the medal tallies for host nations. This allows us to contrast the total medals achieved when a country was hosting olympics to the average of all medals obtained when the country was not hosting.
ggplotly(interactive3)
#interactive3
This Data Table adds interactivity to the normal data frame, allowing us to perform diferent operations like searching, sorting on the data frame dealing with medals obtained by countries.
for_dt = subset(combined_summary_subset,select=c("Country","Olympics medaled in","Medal_Type_Count","Total_Count"))
colnames(for_dt) = c("Country","Olympics_medaled_in","Gold_Medals","Total_Medals")
datatable(for_dt)